# This script integrates the CAPE predictors and the genomic risk scores. 
# This script gives the example code for integrating the CAPE model with the PRS, but the input datasets can be easily adapted for the integration of the CAPE model with the newborn and childhood MRSs (with or without the PRS).
# Newborn MRS - data found in IOWBC_MRS_data.xlsx, sheet: "IOWBC nMRS"
# Childhood MRS - data found in IOWBC_MRS_data.xlsx, sheet: "IOWBC cMRS"
# Integrated datasets for the CAPE+nMRS, CAPE+cMRS and CAPE+PRS+nMRS, and CAPE+PRS+cMRS can be found in IOWBC_CAPE_integrated_data.xlsx
# - note. in IOWBC_CAPE_integrated_data.xlsx, data for the integrated newborn and childhood MRS with the CAPE model can be found within the same spreadsheet. Therefore, if developing the CAPE+nMRS integrated model, the cMRS will need to be deleted from the dataset, and vice versa.
# The integrated model was developed using the same machine learning algorithm and training dataset characteristics as the best CAPE model 
# - i.e. SVM algorithm (RBF kernel) trained on the complete training dataset, oversampled 0%, undersampled 
# To ensure sufficient numbers of individuals for training, a new training-test set split was performed rather than subsetting those with complete data from the initial CAPE training set:
# Integrate data >> standardise >> train-test split >> complete, 0% oversampled & undersampled dataset >> rbf svm


# Imports
import os
import pandas as pd
import numpy as np
from time import time
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, roc_auc_score, roc_curve
from collections import Counter
from sklearn.utils import shuffle
from sklearn.metrics import balanced_accuracy_score, average_precision_score, f1_score
from imblearn.over_sampling import ADASYN
from numpy import argmax, arange
import pickle
# Classifiers
from sklearn.svm import SVC

# Set working directory
# NOTE(review): "/../../.." resolves to the filesystem root — this looks like a
# placeholder; point it at the directory containing the input CSV files before running.
os.chdir("/../../..")

#### Define function to evaluate performance measures
def performance(y_test, y_pred, y_probs):
	"""Print and return a suite of binary-classification performance measures.

	Parameters
	----------
	y_test : array-like of true 0/1 labels.
	y_pred : array-like of predicted 0/1 labels.
	y_probs : array-like of predicted probabilities for the positive class
		(used for ROC AUC and precision-recall AUC).

	Returns
	-------
	dict
		Metric name -> value. Previously the function returned None and
		discarded everything it computed; existing callers that ignore the
		return value are unaffected.
	"""
	# Confusion-matrix layout: rows = true class, columns = predicted class.
	cm_test = confusion_matrix(y_test, y_pred)
	test_report = classification_report(y_test, y_pred)
	accuracy = accuracy_score(y_test, y_pred)
	balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
	sensitivity = cm_test[1,1]/(cm_test[1,0]+cm_test[1,1])
	specificity = cm_test[0,0]/(cm_test[0,0]+cm_test[0,1])
	PPV = cm_test[1,1]/(cm_test[1,1]+cm_test[0,1])
	NPV = cm_test[0,0]/(cm_test[0,0]+cm_test[1,0])
	# Likelihood ratios; NOTE these divide by zero when specificity is
	# exactly 1 (LR+) or 0 (LR-) — numpy emits a RuntimeWarning and yields inf.
	LRp = sensitivity/(1-specificity)
	LRn = (1-sensitivity)/specificity
	F1 = f1_score(y_test, y_pred)
	ROCAUC = roc_auc_score(y_test, y_probs)
	PR_AUC = average_precision_score(y_test, y_probs)
	print (cm_test)
	print (test_report)
	print('accuracy:=%f' % (accuracy))
	print('balanced_accuracy:=%f' % (balanced_accuracy))
	print('Sensitivity:=%f' % (sensitivity))
	print('Specificity:=%f' % (specificity))
	print('PPV:=%f' % (PPV))
	print('NPV:=%f' % (NPV))
	print('LRp:=%f' % (LRp))
	print('LRn:=%f' % (LRn))
	print('F1:=%f' % (F1))  # F1 was computed but never reported before
	print('ROCAUC:=%f' % (ROCAUC))
	print('PR_AUC:=%f' % (PR_AUC))
	return {'confusion_matrix': cm_test, 'report': test_report,
			'accuracy': accuracy, 'balanced_accuracy': balanced_accuracy,
			'sensitivity': sensitivity, 'specificity': specificity,
			'PPV': PPV, 'NPV': NPV, 'LRp': LRp, 'LRn': LRn,
			'F1': F1, 'ROCAUC': ROCAUC, 'PR_AUC': PR_AUC}
	
	
#########################################
### Early-life MODEL - CLINICAL + PRS ###
#########################################
# Load cleaned, unstandardised features used in the CAPE model - data found in IOWBC_data.xlsx, sheet: "Early life data"
data = pd.read_csv("Early_life_QC_1368IDs.csv", index_col=False)
data = data.drop(columns=['Unnamed: 0'])  # drop the saved index column

# Restrict to the 8 predictors used in the CAPE model, plus ID and outcome
cape_vars = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1',
             'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES', 'Asthma_10YR']
data1 = data[cape_vars]

# Add PRS data to the dataset - data found in IOWBC_PRS_data.xlsx, sheet: "IOWBC PRS"
PRS = pd.read_csv("PRS_116snp_Asthma10YR_Adjusted.csv", index_col=False)
PRS = PRS.drop(columns=['In_Regression'])
# 924 IDs 
PRS = PRS.rename(columns={'IID': 'Study_ID'}).dropna()
Counter(PRS.Asthma_10YR)
# 908 samples should have PRS data - 767 controls, 141 cases

# Outer merge keeps every ID from either table; overlapping Asthma_10YR
# columns become Asthma_10YR_x (clinical) and Asthma_10YR_y (PRS file)
all_data = data1.merge(PRS, how='outer', on='Study_ID')
all_data.isnull().sum()

# Save integrated dataset - data found in IOWBC_integrated_data.xlsx, sheet: "CAPE+PRS"
all_data.to_csv("CAPE_PRS_model_data_1368ID.csv")

# Keep complete cases only
all_data = all_data.dropna()
# 538 IDs remain

keep_cols = ['Study_ID', 'Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'PRS',
             'Total.Bf.duration', 'Wheeze_2YR', 'Cough_2YR', 'SES', 'Asthma_10YR_x']
all_data = all_data[keep_cols]
# Split data into training and test sets
	# Attempt to create a fresh split, in a 2:1 ratio to try and increase the number of cases. When splitting based on the original test sets, the models overfit so will check this way. 

# create data_features
complete_subset_features = all_data.drop(columns=['Asthma_10YR_x'])

# create data_outcome
complete_subset_outcome = all_data['Asthma_10YR_x']

# Split dataset into training set and test set: 66.6% training and 33.3% test
# (stratified on the outcome so case/control proportions match in both sets)
X_train, X_test, y_train, y_test = train_test_split(complete_subset_features, complete_subset_outcome,
                                                    stratify=complete_subset_outcome, 
                                                    test_size=0.333, shuffle=True, random_state=123)

# Training set (n=358, asthma=49, no asthma=309)	Test set (n=180, asthma=24, no asthma=156)

# Save the original train/test set IDs (first column is Study_ID)
Train_IDs = X_train.iloc[:, 0].to_frame()
Test_IDs = X_test.iloc[:, 0].to_frame()

# delete Study Id columns from training and test sets
del X_train['Study_ID']
del X_test['Study_ID']

# Standardise the five continuous features (columns 0-4 once the ID is gone).
# The scaler is fitted on the training set only and re-used on the test set
# to avoid information leakage. (A dead `cont` subset assignment was removed.)
cont_cols = ('Mat_age', 'Birthweight', 'Solid_food', 'SDS_BMI_1', 'PRS')
scaler = StandardScaler()
cont_train = pd.DataFrame(scaler.fit_transform(X_train.iloc[:, 0:5]), columns=cont_cols)
cat_train = X_train.iloc[:, 5:]
SX_train = pd.concat([cont_train, cat_train.reset_index(drop=True)], axis=1)
SXY_train = pd.concat([Train_IDs.reset_index(drop=True), SX_train], axis=1)
SXY_train = pd.concat([SXY_train, y_train.reset_index(drop=True)], axis=1)
#SXY_train.to_csv("CAPE_PRS_standardised_training_dataset_358ID.csv") - data found in IOWBC_CAPE_integrated_data.xlsx, sheet: "CAPE+PRS standardised training"

# Apply the training-set scaling to the test set (transform, not fit_transform)
cont_test = pd.DataFrame(scaler.transform(X_test.iloc[:, 0:5]), columns=cont_cols)
cat_test = X_test.iloc[:, 5:]
SX_test = pd.concat([cont_test, cat_test.reset_index(drop=True)], axis=1)
SXY_test = pd.concat([Test_IDs.reset_index(drop=True), SX_test], axis=1)
SXY_test = pd.concat([SXY_test, y_test.reset_index(drop=True)], axis=1)
#SXY_test.to_csv("CAPE_PRS_standardised_test_dataset_180ID.csv") - data found in IOWBC_CAPE_integrated_data.xlsx, sheet: "CAPE+PRS standardised test"

### Apply CAPE training set characteristics ###
# Over/undersampled training data as required - 0% oversampling, undersampling
Counter(SXY_train.Asthma_10YR_x)
#Counter({0: 309, 1: 49})

# Undersample the controls down to the number of cases (49) to balance classes
s1 = SXY_train.loc[SXY_train['Asthma_10YR_x'] == 1]
s0 = SXY_train.loc[SXY_train['Asthma_10YR_x'] == 0]
s0 = shuffle(s0, random_state=123)
s0 = s0.iloc[:49,]
# BUG FIX: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (same rows, ignore_index preserved).
SXY_train_OU = pd.concat([s1, s0], ignore_index=True)
SXY_train_OU = shuffle(SXY_train_OU, random_state=123)
print('Original dataset shape %s' % Counter(SXY_train_OU.Asthma_10YR_x))
# Original dataset shape Counter({0: 49, 1: 49})

# Columns: [0]=Study_ID (dropped), [1:-1]=features, [-1]=Asthma_10YR_x outcome
X_train = SXY_train_OU.iloc[:,1:-1]
y_train = SXY_train_OU.iloc[:,-1]

### Model development ###
# Define a rbf svm classifier 
# Random search
# Stage 1: broad randomised search over log-spaced C/gamma (10^-3 .. 10^2)
# to locate a promising region of hyperparameter space.
# NOTE: the insertion order of param_grid affects which candidates a seeded
# RandomizedSearchCV draws — do not reorder the dict keys, or the pinned
# results in the comments below will no longer reproduce.
clf = SVC(kernel='rbf', probability=True, random_state=123)
C_range = np.logspace(-3,2,100)
gamma_range = np.logspace(-3, 2, 100)
param_grid = dict(gamma=gamma_range, C=C_range)

# balanced_accuracy is the tuning metric (classes are imbalanced in the
# original data; the undersampled training folds here are balanced)
random_search = RandomizedSearchCV(clf, scoring='balanced_accuracy',param_distributions=param_grid,
									n_iter=100, n_jobs=-1, cv=StratifiedKFold(5), random_state=123)
start = time()
random_search.fit(X_train, y_train)
RStime = (time() - start)  # wall-clock duration of the random search
best_parameters = random_search.best_params_
print(best_parameters)
#{'gamma': 0.02595024211399736, 'C': 2.1544346900318843}
best_score = random_search.best_score_
print(best_score)
#0.5714285714285714

# Grid search
# Stage 2: exhaustive search on a finer linear grid around the region the
# random search pointed to (C in [0.01, 5), gamma in [0.001, 0.101)).
clf = SVC(kernel='rbf', probability=True, random_state=123)
C_range = np.arange(0.01, 5, 0.01)
gamma_range = np.arange(0.001, 0.101, 0.001)
param_grid = dict(C=C_range, gamma=gamma_range)
grid_search = GridSearchCV(clf, scoring='balanced_accuracy', param_grid=param_grid, cv=StratifiedKFold(5), n_jobs=16)
start = time()
grid_search.fit(X_train, y_train)
GStime = (time() - start)  # wall-clock duration of the grid search
# Get Grid search results
Candidates = len(grid_search.cv_results_['params'])
print(Candidates)
# best parameters
best_parameters = grid_search.best_params_
print(best_parameters)
#{'C': 2.31, 'gamma': 0.014000000000000002}

best_score = grid_search.best_score_
print(best_score)
#0.5918367346938775

# Full cross-validation results table (save is commented out by design)
results=pd.DataFrame(grid_search.cv_results_)
filename = "CAPE_PRS_rbf_svm_grid_search_results.csv"
#results.to_csv(filename,index=False)

# Build best model
# Hyperparameters hard-coded from the grid-search winner printed above.
best_clf = SVC(kernel='rbf', C=2.31, gamma=0.014, probability=True, random_state=123)

# Fit optimised model
best_clf.fit(X_train,y_train)

### Training set Performance
y_train_pred = best_clf.predict(X_train)

# Probabilities for the positive class (column 1) feed the AUC metrics
probs = best_clf.predict_proba(X_train)
preds = probs[:,1]
performance(y_train, y_train_pred, preds)
ROCAUC_train = roc_auc_score(y_train, preds)
print(ROCAUC_train)
#0.7542690545605998

# Evaluate model in test set
# BUG FIX: the original called `model.predict_proba(...)` but no `model`
# variable exists anywhere in this script (NameError at runtime); the fitted
# classifier is `best_clf`.
probs = best_clf.predict_proba(SX_test)
preds = probs[:,1]
ROCAUC_test = roc_auc_score(y_test, preds)
print(ROCAUC_test)
#0.6485042735042734

y_pred = best_clf.predict(SX_test)
performance(y_test, y_pred,preds)


#### Identify optimal threshold based on Youden's index ####
test_probs = best_clf.predict_proba(SX_test)
# keep probabilities for the positive outcome only
test_preds = test_probs[:,1]
# calculate roc curves
fpr, tpr, thresholds = roc_curve(y_test, test_preds)
# get the best threshold: Youden's J = sensitivity + specificity - 1 = TPR - FPR
J = tpr - fpr
ix = argmax(J)
best_thresh = thresholds[ix]
print('Best Threshold=%f' % (best_thresh))
# 0.4762951954740759

# Obtain classifications based on optimal threshold cutoff
# FIX: reuse test_preds computed above — the original called predict_proba a
# second time on the same data, recomputing identical probabilities.
SX_test['preds'] = test_preds
pred_opt = SX_test['preds'].map(lambda x: 1 if x >= best_thresh else 0)

# Check performance in test set
performance(y_test, pred_opt,SX_test['preds'])

# save the model to disk
# FIX: use a context manager so the file handle is closed deterministically
# (the original open() was never closed).
filename = 'CAPE_PRS_rbfSVM_COU0.sav'
with open(filename, 'wb') as fh:
	pickle.dump(best_clf, fh)